from os import path
import os
import re

# Root directory of the corpus. Only entries whose name starts with 'C' are
# used (presumably one sub-directory per category -- confirm against the data).
rootdir = '/SogouC.reduced/Reduced'
# os.listdir() returns entries in arbitrary order. The position of each entry
# in `dirs` is used further down as its numeric label, so sort the listing to
# make the label assignment reproducible across runs and machines.
dirs = sorted(os.listdir(rootdir))
dirs = [path.join(rootdir,f) for f in dirs if f.startswith('C')]

def load_txt(x):
    """Read the file at path ``x`` and return its contents decoded from GBK.

    The file is opened in binary mode so that the raw bytes are decoded
    explicitly here. A text-mode handle yields ``str`` objects on Python 3,
    which have no ``decode`` method. Bytes that are not valid GBK are
    silently dropped (``errors='ignore'``).

    :param x: path of the file to read.
    :return: the whole file as one decoded (unicode) string.
    """
    with open(x, 'rb') as f:
        # Decode once at the I/O boundary instead of line by line.
        return f.read().decode('gbk', 'ignore')

# text_t maps the position of each directory in `dirs` to the list of decoded
# texts loaded from that directory.
text_t = {}
for i, d in enumerate(dirs):
    files = []
    for x in os.listdir(d):
        # Skip hidden entries and anything that does not end in 'txt'.
        if x.startswith('.') or not x.endswith('txt'):
            continue
        files.append(path.join(d, x))
    text_t[i] = list(map(load_txt, files))

# to dataframe
import pandas as pd
import numpy as np

# Number of texts stored under each label, in text_t's iteration order.
flen = [len(t) for t in text_t.values()]
# Repeat every label once per text it owns. keys() is materialised as a list
# because on Python 3 it is a view object that numpy does not treat as a
# sequence of elements.
labels = np.repeat(list(text_t.keys()), flen)

# flatten nested list
import itertools
merged = list(itertools.chain.from_iterable(text_t.values()))

# One row per text: its numeric label and the raw text.
df = pd.DataFrame({'label': labels, 'txt': merged})
df.head()


# cut character
def cutchar(x):
    """Return the elements of ``x`` joined by single spaces.

    For a string this separates every character with a space, e.g.
    ``'abc'`` becomes ``'a b c'``.
    """
    # str.join iterates x directly; no intermediate list is needed.
    return ' '.join(x)

# Add a column holding each text with its characters separated by spaces.
df['seg_word'] = df.txt.map(cutchar)

# cPickle only exists on Python 2; fall back to pickle on Python 3.
try:
    from cPickle import dump,load
except ImportError:
    from pickle import dump,load

# To refresh the cached frame from the one built above, uncomment:
#with open('df.pickle', 'wb') as fh:
#    dump(df, fh)

# NOTE(review): this replaces the frame built above with the cached copy.
# Unpickling can execute arbitrary code, so only load a df.pickle you trust.
# The with-block closes the file handle, which the bare open() call leaked.
with open('df.pickle', 'rb') as fh:
    df = load(fh)

# Exploration: convert to the format nltk needs by building one flat list that
# holds every whitespace-separated token of every row of 'seg_word'.
txt = df['seg_word'].values
txtnltk = [token for sentence in txt for token in sentence.split()]

# nltk
import nltk
corpus = nltk.Text(txtnltk)
from nltk.probability import FreqDist
# Token frequencies
fdist = FreqDist(corpus)
# Materialise as lists: on Python 3 keys()/values() return views, not lists.
w = list(fdist.keys())
v = list(fdist.values())
freqdf = pd.DataFrame({'word':w,'freq':v})
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
freqdf.sort_values(by='freq', ascending=False, inplace=True)
# Position of each token after sorting (0 = highest frequency).
freqdf['idx'] = np.arange(len(v))
freqdf.head()